# Project: Serbia Survey
# Authors: William O'Brochta and Patrick Cunha Silva
# Goal: STM Analysis on FB posts (Entire Corpus), Within Analysis (TWFEs)

rm(list = ls())

# Load Data
load(file = here('fb_data.RData'))

###########################################################
################## SI B.8 STM: TWFEs ######################
###########################################################

# Prepare corpus
mystopwords <- c('s', '#39', '<', '>', 'rs', 'www', 
                 '8d', 'iä', 'org', 'https', 'http', 'ä', 'dr', 't', 
                 stopwords('english'), stopwords('russian'))
doc_pros <- textProcessor(fb_data$translation, 
                          metadata = fb_data[, c('inCyrillic', 'owner', 
                                                         'leg_election', 'prez_election', 'Facebook.Id', 'year', 'id', 'date')], 
                          language = 'english',
                          removestopwords = TRUE, 
                          removenumbers = TRUE, 
                          removepunctuation = TRUE,
                          lowercase = TRUE, 
                          stem = FALSE, 
                          striphtml = TRUE, 
                          customstopwords = mystopwords,
                          onlycharacter = TRUE)

# Plot Removed Words
plotRemoved(doc_pros[[1]], lower.thresh = seq(from = 10, to = 1000, by = 10)) 

# Remove Words that appear in less than 1% of the documents or in more than 90% or less than 1%
stmObj <- prepDocuments(doc_pros[[1]], doc_pros[[2]], meta=doc_pros$meta
                        , lower.thresh = round(nrow(fb_data) * 0.01), 
                        upper.thresh = round(nrow(fb_data) * 0.90))

# Create object to run model
docs <- stmObj$documents
vocab <- stmObj$vocab
meta <- stmObj$meta

#--------------- Uncomment Lines below to run the model again -------------#
# Note, we use 15 topics to make it consistent with the main analysis
# meta$Facebook.Id <- as.character(meta$Facebook.Id)
# 
# # Run STM 
# matDoc <- stm(documents = docs,
#     vocab = vocab,
#     K = 15,
#     data = meta,
# 	  prevalence = ~ inCyrillic + leg_election + prez_election + year + Facebook.Id,
#     init.type = 'Spectral'
# )
# 
# # Save model
# save(matDoc, file = here('outModel_fe.RData'))

# Load model 
load(file = here('outModel_fe.RData'))

# Label these topics
labelTopics(matDoc)

# Highest Prob: are the words within each topic with the highest probability 
# FREX: are the words that are both frequent and exclusive

# Highly associated documents
findThoughts(matDoc, texts = fb_data$translation[fb_data$id %in% meta$id], n = 2)

##### Topic Names
# Topic 1: Appearances on Media
# Topic 2: European Integration
# Topic 3: Local Governments
# Topic 4: Elections, Politics
# Topic 5: Belgrade
# Topic 6: Kosovo
# Topic 7: Opinions and General Talk
# Topic 8: Serbian Heritage
# Topic 9: Education, Schools
# Topic 10: Covid-19 (Economy)
# Topic 11: Government Affairs
# Topic 12: Environmental, Human, and Women's Right
# Topic 13: International Cooperation
# Topic 14: Serbian Presidency
# Topic 15: Police, Military

# Estimated Effect of Cyrillic
meta$Facebook.Id <- as.character(meta$Facebook.Id)
meta$year  <- as.character(meta$year)
out <- estimateEffect(formula = 1:15~ inCyrillic + leg_election + prez_election + Facebook.Id + year,  
                      metadata = meta, stmobj = matDoc, uncertainty = 'Global', prior = 1e-5)

# Create Figure
par(mar = c(5, 16, 1, 1))
plot(out, covariate = 'inCyrillic', 
     cov.value1 = 1, cov.value2 = 0, 
     method = 'difference', 
     labeltype = 'custom', 
     ci.level = 0.95,
     axes = FALSE,
     xlim = c(-0.025, 0.025),
     custom.labels = '',
     xlab = 'Difference in Topic Proportion',
     cex.lab = 1.5
)
mylabels <- c ("Media Appearances"
               , "European Integration"
               , "Local Governments"
               , "Elections/Politics"
               , "Belgrade"
               , "Kosovo"
               , "Opinions and General Talk"
               , "Serbian Heritage"
               , "Education, Schools"
               , "Economy/Covid-19"
               , "Government Affairs"
               , "Environmental, Human,\nand Women's Rights"
               , "International Cooperation"
               , "Serbian Presidency and SNS"
               , "Police, Military")
axis(2, at = 1:15, labels = rev(mylabels), las = 2, cex.axis = 1.25)
axis(1, at = seq(-0.05, 0.05, by = 0.025), cex.axis = 1.25)

# Create tables, we will do 3 tables, each will have 5 topics.
toTable <- matrix(NA, ncol = 5, nrow = 11) # Topics 1-5
toTable[seq(1, 8, 2), 1] <- round(unname(summary(out)[['tables']][[1]][1:4, 'Estimate']), 3) # Betas 1
toTable[seq(1, 8, 2), 2] <- round(unname(summary(out)[['tables']][[2]][1:4, 'Estimate']), 3) # Betas 2
toTable[seq(1, 8, 2), 3] <- round(unname(summary(out)[['tables']][[3]][1:4, 'Estimate']), 3) # Betas 3
toTable[seq(1, 8, 2), 4] <- round(unname(summary(out)[['tables']][[4]][1:4, 'Estimate']), 3) # Betas 4
toTable[seq(1, 8, 2), 5] <- round(unname(summary(out)[['tables']][[5]][1:4, 'Estimate']), 3) # Betas 5
toTable[seq(2, 9, 2), 1] <- paste0('(', round(unname(summary(out)[['tables']][[1]][1:4, 'Std. Error']), 3), ')') # SE 1
toTable[seq(2, 9, 2), 2] <- paste0('(', round(unname(summary(out)[['tables']][[2]][1:4, 'Std. Error']), 3), ')') # SE 2
toTable[seq(2, 9, 2), 3] <- paste0('(', round(unname(summary(out)[['tables']][[3]][1:4, 'Std. Error']), 3), ')') # SE 4
toTable[seq(2, 9, 2), 4] <- paste0('(', round(unname(summary(out)[['tables']][[4]][1:4, 'Std. Error']), 3), ')') # SE 4
toTable[seq(2, 9, 2), 5] <- paste0('(', round(unname(summary(out)[['tables']][[5]][1:4, 'Std. Error']), 3), ')') # SE 5
toTable[9, ] <- 'Yes'
toTable[10, ] <- 'Yes'
toTable[11, ] <- nrow(meta)
colnames(toTable) <- paste("Topic", 1:5)
rownames(toTable) <- c('Intercept', '', 'Cyrillic', '',
                       'Legislative Election', '', 'Presidential Election', '',
                       'FE by Year', 'FE by Page', 'N')
stargazer::stargazer(toTable, style = 'apsr', type = 'text')

toTable <- matrix(NA, ncol = 5, nrow = 11) # Topics 6-10
toTable[seq(1, 8, 2), 1] <- round(unname(summary(out)[['tables']][[6]][1:4, 'Estimate']), 3) # Betas 6
toTable[seq(1, 8, 2), 2] <- round(unname(summary(out)[['tables']][[7]][1:4, 'Estimate']), 3) # Betas 7
toTable[seq(1, 8, 2), 3] <- round(unname(summary(out)[['tables']][[8]][1:4, 'Estimate']), 3) # Betas 8
toTable[seq(1, 8, 2), 4] <- round(unname(summary(out)[['tables']][[9]][1:4, 'Estimate']), 3) # Betas 9
toTable[seq(1, 8, 2), 5] <- round(unname(summary(out)[['tables']][[10]][1:4, 'Estimate']), 3) # Betas 10
toTable[seq(2, 9, 2), 1] <- paste0('(', round(unname(summary(out)[['tables']][[6]][1:4, 'Std. Error']), 3), ')') # SE 6
toTable[seq(2, 9, 2), 2] <- paste0('(', round(unname(summary(out)[['tables']][[7]][1:4, 'Std. Error']), 3), ')') # SE 7
toTable[seq(2, 9, 2), 3] <- paste0('(', round(unname(summary(out)[['tables']][[8]][1:4, 'Std. Error']), 3), ')') # SE 8
toTable[seq(2, 9, 2), 4] <- paste0('(', round(unname(summary(out)[['tables']][[9]][1:4, 'Std. Error']), 3), ')') # SE 9
toTable[seq(2, 9, 2), 5] <- paste0('(', round(unname(summary(out)[['tables']][[10]][1:4, 'Std. Error']), 3), ')') # SE 10
toTable[9, ] <- 'Yes'
toTable[10, ] <- 'Yes'
toTable[11, ] <- nrow(meta)
colnames(toTable) <- paste("Topic", 6:10)
rownames(toTable) <- c('Intercept', '', 'Cyrillic', '',
                       'Legislative Election', '', 'Presidential Election', '',
                       'FE by Year', 'FE by Page', 'N')
stargazer::stargazer(toTable, style = 'apsr', type = 'text')

toTable <- matrix(NA, ncol = 5, nrow = 11) # Topics 11-15
toTable[seq(1, 8, 2), 1] <- round(unname(summary(out)[['tables']][[11]][1:4, 'Estimate']), 3) # Betas 11
toTable[seq(1, 8, 2), 2] <- round(unname(summary(out)[['tables']][[12]][1:4, 'Estimate']), 3) # Betas 12
toTable[seq(1, 8, 2), 3] <- round(unname(summary(out)[['tables']][[13]][1:4, 'Estimate']), 3) # Betas 13
toTable[seq(1, 8, 2), 4] <- round(unname(summary(out)[['tables']][[14]][1:4, 'Estimate']), 3) # Betas 14
toTable[seq(1, 8, 2), 5] <- round(unname(summary(out)[['tables']][[15]][1:4, 'Estimate']), 3) # Betas 15
toTable[seq(2, 9, 2), 1] <- paste0('(', round(unname(summary(out)[['tables']][[11]][1:4, 'Std. Error']), 3), ')') # SE 11
toTable[seq(2, 9, 2), 2] <- paste0('(', round(unname(summary(out)[['tables']][[12]][1:4, 'Std. Error']), 3), ')') # SE 12
toTable[seq(2, 9, 2), 3] <- paste0('(', round(unname(summary(out)[['tables']][[13]][1:4, 'Std. Error']), 3), ')') # SE 13
toTable[seq(2, 9, 2), 4] <- paste0('(', round(unname(summary(out)[['tables']][[14]][1:4, 'Std. Error']), 3), ')') # SE 14
toTable[seq(2, 9, 2), 5] <- paste0('(', round(unname(summary(out)[['tables']][[15]][1:4, 'Std. Error']), 3), ')') # SE 15
toTable[9, ] <- 'Yes'
toTable[10, ] <- 'Yes'
toTable[11, ] <- nrow(meta)
colnames(toTable) <- paste("Topic", 11:15)
rownames(toTable) <- c('Intercept', '', 'Cyrillic', '',
                       'Legislative Election', '', 'Presidential Election', '',
                       'FE by Year', 'FE by Page', 'N')
stargazer::stargazer(toTable, style = 'apsr', type = 'text')



